In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

Figure 2.1


In [5]:
data = pd.read_csv('data/Advertising.csv')
data.head()


Out[5]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9

In [6]:
plt.figure(figsize=(18, 6))
plt.subplot(131)
sns.regplot(x='TV', y='Sales', data=data, scatter_kws={'color': 'red'})
plt.xlim(-5, 310)
plt.subplot(132)
sns.regplot(x='Radio', y='Sales', data=data, scatter_kws={'color': 'red'})
plt.xlim(-5, 55)
plt.subplot(133)
sns.regplot(x='Newspaper', y='Sales', data=data, scatter_kws={'color': 'red'})
plt.xlim(-5, 110)


Out[6]:
(-5, 110)

Subsquent figures use data that is not available

Exercise 2.8

8a


In [7]:
college = pd.read_csv('data/College.csv')
college.head()


Out[7]:
Unnamed: 0 Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 Abilene Christian University Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 Adelphi University Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
2 Adrian College Yes 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
3 Agnes Scott College Yes 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
4 Alaska Pacific University Yes 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15

8b


In [8]:
college.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)
college.head(2)


Out[8]:
Name Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 Abilene Christian University Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 Adelphi University Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56

8c i


In [9]:
college.describe()


Out[9]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
count 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.000000 777.00000
mean 3001.638353 2018.804376 779.972973 27.558559 55.796654 3699.907336 855.298584 10440.669241 4357.526384 549.380952 1340.642214 72.660232 79.702703 14.089704 22.743887 9660.171171 65.46332
std 3870.201484 2451.113971 929.176190 17.640364 19.804778 4850.420531 1522.431887 4023.016484 1096.696416 165.105360 677.071454 16.328155 14.722359 3.958349 12.391801 5221.768440 17.17771
min 81.000000 72.000000 35.000000 1.000000 9.000000 139.000000 1.000000 2340.000000 1780.000000 96.000000 250.000000 8.000000 24.000000 2.500000 0.000000 3186.000000 10.00000
25% 776.000000 604.000000 242.000000 15.000000 41.000000 992.000000 95.000000 7320.000000 3597.000000 470.000000 850.000000 62.000000 71.000000 11.500000 13.000000 6751.000000 53.00000
50% 1558.000000 1110.000000 434.000000 23.000000 54.000000 1707.000000 353.000000 9990.000000 4200.000000 500.000000 1200.000000 75.000000 82.000000 13.600000 21.000000 8377.000000 65.00000
75% 3624.000000 2424.000000 902.000000 35.000000 69.000000 4005.000000 967.000000 12925.000000 5050.000000 600.000000 1700.000000 85.000000 92.000000 16.500000 31.000000 10830.000000 78.00000
max 48094.000000 26330.000000 6392.000000 96.000000 100.000000 31643.000000 21836.000000 21700.000000 8124.000000 2340.000000 6800.000000 103.000000 100.000000 39.800000 64.000000 56233.000000 118.00000

8c ii


In [10]:
sns.pairplot(college.iloc[:, 2:11])


Out[10]:
<seaborn.axisgrid.PairGrid at 0x8a5f828>

8c iii


In [11]:
sns.boxplot(x='Private', y='Outstate', data=college)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0xd6bcfd0>

8c iv


In [12]:
college.loc[college.loc[:, 'Top10perc'] > 50, 'Elite'] = 'Yes'
college.loc[college.loc[:, 'Top10perc'] <= 50, 'Elite'] = 'No'
print(college.Elite.value_counts())
sns.boxplot(x='Elite', y='Outstate', data=college)


No     699
Yes     78
dtype: int64
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0xe1d8860>

8c v


In [13]:
plt.figure(figsize=(10,10))
plt.subplot(221)
sns.distplot(college['Apps'], kde=False, bins=20)
plt.subplot(222)
sns.distplot(college['Apps'], kde=False, bins=100)
plt.subplot(223)
sns.distplot(college['Outstate'], kde=False, bins=20)
plt.subplot(224)
sns.distplot(college['Outstate'], kde=False, bins=100)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x108bbda0>

Exercise 2.9

9a


In [14]:
auto = pd.read_csv('data/Auto.csv')
auto.head()


Out[14]:
mpg cylinders displacement horsepower weight acceleration year origin name
0 18 8 307 130 3504 12.0 70 1 chevrolet chevelle malibu
1 15 8 350 165 3693 11.5 70 1 buick skylark 320
2 18 8 318 150 3436 11.0 70 1 plymouth satellite
3 16 8 304 150 3433 12.0 70 1 amc rebel sst
4 17 8 302 140 3449 10.5 70 1 ford torino

9b and 9c


In [15]:
auto.describe()


Out[15]:
mpg cylinders displacement weight acceleration year origin
count 397.000000 397.000000 397.000000 397.000000 397.000000 397.000000 397.000000
mean 23.515869 5.458438 193.532746 2970.261965 15.555668 75.994962 1.574307
std 7.825804 1.701577 104.379583 847.904119 2.749995 3.690005 0.802549
min 9.000000 3.000000 68.000000 1613.000000 8.000000 70.000000 1.000000
25% 17.500000 4.000000 104.000000 2223.000000 13.800000 73.000000 1.000000
50% 23.000000 4.000000 146.000000 2800.000000 15.500000 76.000000 1.000000
75% 29.000000 8.000000 262.000000 3609.000000 17.100000 79.000000 2.000000
max 46.600000 8.000000 455.000000 5140.000000 24.800000 82.000000 3.000000

9d


In [17]:
ss1 = auto[:10]
ss2 = auto[85:]
subset = pd.concat([ss1, ss2])
subset.describe()


Out[17]:
mpg cylinders displacement weight acceleration year origin
count 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000
mean 24.409317 5.378882 187.680124 2936.807453 15.700621 77.130435 1.596273
std 7.913357 1.657398 100.120925 810.987533 2.706436 3.131849 0.815572
min 11.000000 3.000000 68.000000 1649.000000 8.500000 70.000000 1.000000
25% 18.000000 4.000000 100.250000 2216.000000 14.000000 75.000000 1.000000
50% 23.900000 4.000000 145.500000 2797.500000 15.500000 77.000000 1.000000
75% 30.650000 6.000000 250.000000 3516.000000 17.275000 80.000000 2.000000
max 46.600000 8.000000 455.000000 4997.000000 24.800000 82.000000 3.000000

9e


In [18]:
sns.pairplot(auto.iloc[:, :8])


Out[18]:
<seaborn.axisgrid.PairGrid at 0x107c0f60>

9f


In [22]:
plt.figure(figsize=(12,6))
plt.subplot(121)
sns.boxplot(x='cylinders', y='mpg', data=auto)
plt.subplot(122)
sns.regplot(x='weight', y='mpg', data=auto, scatter_kws={'color': 'red'})


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x150f86d8>

Exercise 2.10

10a and d


In [24]:
boston = pd.read_csv('data/Boston.csv')
boston.describe()


Out[24]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 253.500000 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 146.213884 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 1.000000 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 127.250000 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 253.500000 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 379.750000 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 506.000000 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

10b and c


In [25]:
sns.pairplot(boston[1:])


Out[25]:
<seaborn.axisgrid.PairGrid at 0x152dc2b0>

10 e


In [27]:
boston.chas.sum() # value is 1 if next to Charles otherwise value is zero so sum is number next to Charles


Out[27]:
35

10 f


In [28]:
boston.ptratio.median()


Out[28]:
19.05

10 g


In [30]:
boston[boston.medv == boston.medv.min()]


Out[30]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
398 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.90 30.59 5
405 406 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 384.97 22.98 5

10 h


In [31]:
boston[boston.rm >= 7].describe()


Out[31]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000 64.000000
mean 224.015625 0.979109 28.171875 5.775625 0.125000 0.504455 7.570094 60.640625 4.199617 5.984375 312.234375 16.259375 388.275156 5.474062 38.396875
std 93.675308 2.807599 34.053089 5.544494 0.333333 0.092863 0.481467 27.858587 2.074423 5.655429 118.311365 2.351407 9.487005 2.906582 8.722639
min 3.000000 0.009060 0.000000 0.460000 0.000000 0.394000 7.007000 8.400000 1.202400 1.000000 193.000000 12.600000 354.310000 1.730000 15.000000
25% 186.000000 0.045023 0.000000 2.460000 0.000000 0.430250 7.183250 36.000000 2.444925 3.000000 244.750000 14.700000 384.922500 3.555000 32.975000
50% 230.500000 0.097860 20.000000 3.970000 0.000000 0.488000 7.414000 63.800000 3.495200 5.000000 273.000000 17.400000 390.660000 4.775000 36.450000
75% 270.250000 0.542893 45.000000 6.200000 0.000000 0.582500 7.858500 85.025000 5.462925 7.000000 329.000000 17.925000 395.305000 6.590000 46.175000
max 483.000000 19.609100 95.000000 19.580000 1.000000 0.718000 8.780000 100.000000 9.222900 24.000000 666.000000 20.200000 396.900000 16.740000 50.000000

In [32]:
boston[boston.rm >= 8].describe()


Out[32]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000 13.000000
mean 232.307692 0.718795 13.615385 7.078462 0.153846 0.539238 8.348538 71.538462 3.430192 7.461538 325.076923 16.361538 385.210769 4.310000 44.200000
std 60.915768 0.901640 26.298094 5.392767 0.375534 0.092352 0.251261 24.608723 1.883955 5.332532 110.971063 2.410580 10.529359 1.373566 8.092383
min 98.000000 0.020090 0.000000 2.680000 0.000000 0.416100 8.034000 8.400000 1.801000 2.000000 224.000000 13.000000 354.550000 2.470000 21.900000
25% 225.000000 0.331470 0.000000 3.970000 0.000000 0.504000 8.247000 70.400000 2.288500 5.000000 264.000000 14.700000 384.540000 3.320000 41.700000
50% 233.000000 0.520140 0.000000 6.200000 0.000000 0.507000 8.297000 78.300000 2.894400 7.000000 307.000000 17.400000 386.860000 4.140000 48.300000
75% 258.000000 0.578340 20.000000 6.200000 0.000000 0.605000 8.398000 86.500000 3.651900 8.000000 307.000000 17.400000 389.700000 5.120000 50.000000
max 365.000000 3.474280 95.000000 19.580000 1.000000 0.718000 8.780000 93.900000 8.906700 24.000000 666.000000 20.200000 396.900000 7.440000 50.000000

In [ ]: